AAQoL machine learning analysis with unbalanced random forest

Author

Miguel Fudolig

library(tidyverse)
library(ggplot2)
library(lavaan)
library(car)
library(glmnet)
library(randomForestSRC)
library(caret)

Data set

This data set is from the 2015 Asian American Quality of Life survey. Participants are from Austin, Texas.

Input data set

qol <- read_csv("AAQoL.csv") |> mutate(across(where(is.character), ~as.factor(.x))) |> 
  mutate(`English Difficulties`=relevel(`English Difficulties`,ref="Not at all"),
         `English Speaking`=relevel(`English Speaking`,ref="Not at all"),
         Ethnicity = relevel(Ethnicity,ref="Chinese")) |> 
  mutate(Income_median = case_match(Income,"$0 - $9,999"~"Below",
                                         "$10,000 - $19,999" ~"Below",
                                         "$20,000 - $29,999"~"Below",
                                         "$30,000 - $39,999"~"Below",
                                         "$40,000 - $49,999"~"Below",
                                         "$50,000 - $59,999"~"Below",
                                         "$60,000 - $69,999"~"Above",
                                         "$70,000 and over"~"Above",
                                          .default=Income)) |> 
  mutate(Income_median = factor(Income_median, levels=c("Below","Above")))
New names:
Rows: 2609 Columns: 231
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(190): Gender, Ethnicity, Marital Status, No One, Spouse, Children, Gran... dbl
(41): Survey ID, Age, Education Completed, Household Size, Grandparent,...
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `Other` -> `Other...17`
• `Other` -> `Other...89`
qol |> DT::datatable()
Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html

Source of Information: Family

ps(Family)
# A tibble: 4 × 3
  Family     n     pct
  <fct>  <int>   <dbl>
1 3          1  0.0383
2 No      1258 48.2   
3 Yes     1331 51.0   
4 <NA>      19  0.728 
rfdata <- qol |> filter(Family %in% c("No","Yes")) |> 
  mutate(Family=droplevels(Family)) |> 
  select(Family, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`) %>%
  # filter(!is.na(Family)) |> 
  na.omit() |>
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`) |> 
  as.data.frame()

# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Family ~ .,importance=T,data=rfdata,
                    perf.type = "gmean",splitrule="gini")
print(rfobj)
                         Sample size: 2187
           Frequency of class labels: 1069, 1118
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 481.777
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1382
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0458
                   (OOB) Brier score: 0.23052563
        (OOB) Normalized Brier score: 0.92210253
                           (OOB) AUC: 0.65475128
                      (OOB) Log-loss: 0.6518979
                        (OOB) PR-AUC: 0.61704468
                        (OOB) G-mean: 0.60961421
   (OOB) Requested performance error: 0.39038579

Confusion matrix:

          predicted
  observed  No Yes class.error
       No  705 364      0.3405
       Yes 488 630      0.4365

      (OOB) Misclassification rate: 0.3895748
print(rfobj)
                         Sample size: 2187
           Frequency of class labels: 1069, 1118
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 481.777
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1382
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0458
                   (OOB) Brier score: 0.23052563
        (OOB) Normalized Brier score: 0.92210253
                           (OOB) AUC: 0.65475128
                      (OOB) Log-loss: 0.6518979
                        (OOB) PR-AUC: 0.61704468
                        (OOB) G-mean: 0.60961421
   (OOB) Requested performance error: 0.39038579

Confusion matrix:

          predicted
  observed  No Yes class.error
       No  705 364      0.3405
       Yes 488 630      0.4365

      (OOB) Misclassification rate: 0.3895748
plot(rfobj,plots.one.page = FALSE)


                              all   No   Yes
Age                        0.0316   NA    NA
Ethnicity                  0.0278   NA    NA
EnglishSpeak               0.0130   NA    NA
See Family                 0.0094   NA    NA
Similar Values             0.0080   NA    NA
Family Respect             0.0060   NA    NA
Community Trust            0.0044   NA    NA
EnglishDiff                0.0042   NA    NA
Gender                     0.0040   NA    NA
Get Along                  0.0035   NA    NA
Helpful Community          0.0033   NA    NA
Religion                   0.0032   NA    NA
Family Pride               0.0031   NA    NA
See Friends                0.0028   NA    NA
Spend Time Together        0.0024   NA    NA
Close Friends              0.0023   NA    NA
Trust                      0.0023   NA    NA
Religious Importance       0.0020   NA    NA
Income_median              0.0019   NA    NA
Close-knit Community       0.0017   NA    NA
Community Shares Values    0.0015   NA    NA
Loyalty                    0.0015   NA    NA
Feel Close                 0.0008   NA    NA
Expression                 0.0003   NA    NA
Religious Attendance      -0.0006   NA    NA
Togetherness              -0.0009   NA    NA
rfobj$importance
                                  all No Yes
Ethnicity                0.0277891088 NA  NA
Age                      0.0316126279 NA  NA
Gender                   0.0040003578 NA  NA
Religion                 0.0031809281 NA  NA
Employment              -0.0019746339 NA  NA
Income_median            0.0019383649 NA  NA
EnglishSpeak             0.0130168401 NA  NA
EnglishDiff              0.0042021061 NA  NA
See Family               0.0093535682 NA  NA
Close Family            -0.0012422567 NA  NA
Helpful Family          -0.0046550437 NA  NA
See Friends              0.0028436753 NA  NA
Close Friends            0.0023006026 NA  NA
Helpful Friends         -0.0075217884 NA  NA
Family Respect           0.0060014684 NA  NA
Similar Values           0.0080321841 NA  NA
Successful Family       -0.0055411576 NA  NA
Trust                    0.0022647824 NA  NA
Loyalty                  0.0014531945 NA  NA
Family Pride             0.0031305696 NA  NA
Expression               0.0002745701 NA  NA
Spend Time Together      0.0024204779 NA  NA
Feel Close               0.0008158354 NA  NA
Togetherness            -0.0009161691 NA  NA
Religious Attendance    -0.0006420113 NA  NA
Religious Importance     0.0019934441 NA  NA
Close-knit Community     0.0016836881 NA  NA
Helpful Community        0.0032857974 NA  NA
Community Shares Values  0.0015075418 NA  NA
Get Along                0.0035156076 NA  NA
Community Trust          0.0044315729 NA  NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw()
  
plot(importance_plot)

Training/Test set Variable Importance

Training Importance

pos<- rfdata |> filter(Family=="Yes")
neg <- rfdata |> filter(Family=="No")

set.seed(222)
imbal_index <- caret::createDataPartition(rfdata$Family,p=0.75,list=F,times=1)
train <- rfdata[imbal_index,]
test<- rfdata[-imbal_index,]

rfsrc(Family~.,data=train, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
# rfobj <- imbalanced(Family ~ .,importance=T,data=train,
#                     perf.type = "gmean",splitrule="gini")
print(rfobj)
                         Sample size: 1641
           Frequency of class labels: 802, 839
                     Number of trees: 500
           Forest terminal node size: 1
       Average no. of terminal nodes: 361.26
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1037
                            Analysis: RF-C
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0461
                   (OOB) Brier score: 0.23541641
        (OOB) Normalized Brier score: 0.94166563
                           (OOB) AUC: 0.63563863
                      (OOB) Log-loss: 0.66299671
                        (OOB) PR-AUC: 0.59083227
                        (OOB) G-mean: 0.59246736
   (OOB) Requested performance error: 0.40819021

Confusion matrix:

          predicted
  observed  No Yes class.error
       No  481 321      0.4002
       Yes 350 489      0.4172

      (OOB) Misclassification rate: 0.408897
print(rfobj)
                         Sample size: 1641
           Frequency of class labels: 802, 839
                     Number of trees: 500
           Forest terminal node size: 1
       Average no. of terminal nodes: 361.26
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1037
                            Analysis: RF-C
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0461
                   (OOB) Brier score: 0.23541641
        (OOB) Normalized Brier score: 0.94166563
                           (OOB) AUC: 0.63563863
                      (OOB) Log-loss: 0.66299671
                        (OOB) PR-AUC: 0.59083227
                        (OOB) G-mean: 0.59246736
   (OOB) Requested performance error: 0.40819021

Confusion matrix:

          predicted
  observed  No Yes class.error
       No  481 321      0.4002
       Yes 350 489      0.4172

      (OOB) Misclassification rate: 0.408897
plot(rfobj,plots.one.page = FALSE)


                             all   No   Yes
Age                       0.0256   NA    NA
Ethnicity                 0.0124   NA    NA
See Friends               0.0071   NA    NA
Helpful Family            0.0056   NA    NA
Close Family              0.0049   NA    NA
Get Along                 0.0037   NA    NA
Spend Time Together       0.0034   NA    NA
EnglishDiff               0.0027   NA    NA
Close-knit Community      0.0027   NA    NA
EnglishSpeak              0.0024   NA    NA
Family Respect            0.0020   NA    NA
Feel Close                0.0019   NA    NA
Similar Values            0.0019   NA    NA
Trust                     0.0019   NA    NA
Togetherness              0.0016   NA    NA
Helpful Community         0.0015   NA    NA
Community Shares Values   0.0015   NA    NA
Expression                0.0013   NA    NA
Loyalty                   0.0011   NA    NA
Close Friends             0.0011   NA    NA
Helpful Friends           0.0009   NA    NA
Gender                    0.0009   NA    NA
Religious Importance      0.0007   NA    NA
Community Trust           0.0007   NA    NA
Income_median             0.0006   NA    NA
Employment                0.0005   NA    NA
rfobj$importance
                                  all No Yes
Ethnicity                0.0124488512 NA  NA
Age                      0.0256417238 NA  NA
Gender                   0.0008643104 NA  NA
Religion                 0.0003919468 NA  NA
Employment               0.0005279885 NA  NA
Income_median            0.0005553531 NA  NA
EnglishSpeak             0.0023720548 NA  NA
EnglishDiff              0.0027370131 NA  NA
See Family              -0.0003238129 NA  NA
Close Family             0.0049270472 NA  NA
Helpful Family           0.0055638915 NA  NA
See Friends              0.0070965402 NA  NA
Close Friends            0.0010713659 NA  NA
Helpful Friends          0.0009139973 NA  NA
Family Respect           0.0020377689 NA  NA
Similar Values           0.0018846736 NA  NA
Successful Family       -0.0007818162 NA  NA
Trust                    0.0018536063 NA  NA
Loyalty                  0.0010989735 NA  NA
Family Pride             0.0004530480 NA  NA
Expression               0.0013203062 NA  NA
Spend Time Together      0.0034320537 NA  NA
Feel Close               0.0019263946 NA  NA
Togetherness             0.0016238509 NA  NA
Religious Attendance    -0.0005669903 NA  NA
Religious Importance     0.0007134177 NA  NA
Close-knit Community     0.0026958162 NA  NA
Helpful Community        0.0015000579 NA  NA
Community Shares Values  0.0014710505 NA  NA
Get Along                0.0037314878 NA  NA
Community Trust          0.0006983894 NA  NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw()
  
plot(importance_plot)

Test Set Importance

test_rf <- predict.rfsrc(rfobj,newdata=test,importance = T)
get.imbalanced.performance(test_rf)
 n.majority  n.minority      iratio   threshold        sens        spec 
279.0000000 267.0000000   1.0449438   0.4890110   0.6441948   0.5555556 
       prec         npv    misclass       brier  brier.norm         auc 
  0.5810811   0.6200000   0.4010989   0.2342201   0.9368802   0.6376102 
    logloss          F1       F1mod pr.auc.rand      pr.auc     F1gmean 
  0.6595606   0.6110124   0.5982508   0.4890110   0.6059950   0.6046241 
 F1modgmean       gmean 
  0.5982433   0.5982357 
test_rf$importance
                                  all No Yes
Ethnicity                0.0155882728 NA  NA
Age                      0.0335115708 NA  NA
Gender                   0.0048163780 NA  NA
Religion                 0.0051198457 NA  NA
Employment               0.0004498480 NA  NA
Income_median            0.0033055926 NA  NA
EnglishSpeak             0.0021304424 NA  NA
EnglishDiff              0.0115481464 NA  NA
See Family               0.0031161807 NA  NA
Close Family             0.0107349956 NA  NA
Helpful Family           0.0101640966 NA  NA
See Friends              0.0045042555 NA  NA
Close Friends            0.0057056810 NA  NA
Helpful Friends          0.0062768016 NA  NA
Family Respect           0.0045915047 NA  NA
Similar Values           0.0032233405 NA  NA
Successful Family        0.0023504195 NA  NA
Trust                    0.0019277831 NA  NA
Loyalty                  0.0035133417 NA  NA
Family Pride            -0.0003255301 NA  NA
Expression               0.0013490105 NA  NA
Spend Time Together      0.0006522597 NA  NA
Feel Close               0.0024226706 NA  NA
Togetherness             0.0024323453 NA  NA
Religious Attendance     0.0011744121 NA  NA
Religious Importance    -0.0014256837 NA  NA
Close-knit Community    -0.0011670545 NA  NA
Helpful Community        0.0061264047 NA  NA
Community Shares Values  0.0045657712 NA  NA
Get Along                0.0001436143 NA  NA
Community Trust          0.0027542658 NA  NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
  theme_bw()

importance_plot

Source of Information: Health Professionals

ps(`Heal Professionals`)
# A tibble: 3 × 3
  `Heal Professionals`     n    pct
  <fct>                <int>  <dbl>
1 No                    1326 50.8  
2 Yes                   1264 48.4  
3 <NA>                    19  0.728
rfdata <- qol |> 
  select(`Heal Professionals`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`) %>%
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`) |> 
  as.data.frame() 

imbalanced(`Heal Professionals` ~ .,importance=T,data=rfdata,
                    perf.type = "gmean",splitrule="gini")->rfobj

print(rfobj)
                         Sample size: 2188
           Frequency of class labels: 1067, 1121
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 482.5363
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1383
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0506
                   (OOB) Brier score: 0.23144405
        (OOB) Normalized Brier score: 0.92577618
                           (OOB) AUC: 0.65773693
                      (OOB) Log-loss: 0.65531206
                        (OOB) PR-AUC: 0.62771089
                        (OOB) G-mean: 0.61988973
   (OOB) Requested performance error: 0.38011027

Confusion matrix:

          predicted
  observed  No Yes class.error
       No  670 397      0.3721
       Yes 435 686      0.3880

      (OOB) Misclassification rate: 0.3802559
plot(rfobj,plots.one.page = FALSE)


                             all   No   Yes
EnglishSpeak              0.0289   NA    NA
Close Friends             0.0137   NA    NA
Income_median             0.0123   NA    NA
See Friends               0.0091   NA    NA
Age                       0.0077   NA    NA
Community Trust           0.0064   NA    NA
Religious Attendance      0.0064   NA    NA
See Family                0.0059   NA    NA
Close Family              0.0055   NA    NA
Employment                0.0046   NA    NA
Community Shares Values   0.0041   NA    NA
Expression                0.0037   NA    NA
Helpful Family            0.0037   NA    NA
Helpful Community         0.0032   NA    NA
Close-knit Community      0.0032   NA    NA
EnglishDiff               0.0028   NA    NA
Trust                     0.0024   NA    NA
Family Pride              0.0024   NA    NA
Religious Importance      0.0023   NA    NA
Family Respect            0.0018   NA    NA
Similar Values            0.0014   NA    NA
Gender                    0.0009   NA    NA
Togetherness              0.0006   NA    NA
Feel Close                0.0006   NA    NA
Successful Family         0.0001   NA    NA
Religion                  0.0000   NA    NA
rfobj$importance
                                  all No Yes
Ethnicity               -1.677537e-03 NA  NA
Age                      7.733131e-03 NA  NA
Gender                   8.772761e-04 NA  NA
Religion                 3.843916e-05 NA  NA
Employment               4.561907e-03 NA  NA
Income_median            1.230494e-02 NA  NA
EnglishSpeak             2.885256e-02 NA  NA
EnglishDiff              2.767643e-03 NA  NA
See Family               5.928918e-03 NA  NA
Close Family             5.511011e-03 NA  NA
Helpful Family           3.711948e-03 NA  NA
See Friends              9.133932e-03 NA  NA
Close Friends            1.367838e-02 NA  NA
Helpful Friends         -4.988185e-04 NA  NA
Family Respect           1.791640e-03 NA  NA
Similar Values           1.402887e-03 NA  NA
Successful Family        7.081082e-05 NA  NA
Trust                    2.405354e-03 NA  NA
Loyalty                 -4.967590e-03 NA  NA
Family Pride             2.385045e-03 NA  NA
Expression               3.728908e-03 NA  NA
Spend Time Together     -4.533935e-03 NA  NA
Feel Close               5.761629e-04 NA  NA
Togetherness             6.207130e-04 NA  NA
Religious Attendance     6.357330e-03 NA  NA
Religious Importance     2.263205e-03 NA  NA
Close-knit Community     3.177590e-03 NA  NA
Helpful Community        3.185724e-03 NA  NA
Community Shares Values  4.146285e-03 NA  NA
Get Along               -2.654628e-03 NA  NA
Community Trust          6.374364e-03 NA  NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw()
  
plot(importance_plot)

Training/Test set Variable Importance

Training Importance

pos<- rfdata |> filter(`Heal Professionals`=="Yes")
neg <- rfdata |> filter(`Heal Professionals`==0)

set.seed(222)
imbal_index <- createDataPartition(rfdata$`Heal Professionals`,p=0.75,list=F,times=1)
train <- rfdata[imbal_index,]
test<- rfdata[-imbal_index,]

# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(`Heal Professionals` ~ .,importance=T,data=train,
                    perf.type = "gmean",splitrule="gini")
print(rfobj)
                         Sample size: 1642
           Frequency of class labels: 801, 841
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 362.0227
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1038
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0499
                   (OOB) Brier score: 0.23293394
        (OOB) Normalized Brier score: 0.93173574
                           (OOB) AUC: 0.65048668
                      (OOB) Log-loss: 0.65855641
                        (OOB) PR-AUC: 0.62608067
                        (OOB) G-mean: 0.60369638
   (OOB) Requested performance error: 0.39630362

Confusion matrix:

          predicted
  observed  No Yes class.error
       No  492 309      0.3858
       Yes 342 499      0.4067

      (OOB) Misclassification rate: 0.3964677
print(rfobj)
                         Sample size: 1642
           Frequency of class labels: 801, 841
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 362.0227
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1038
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0499
                   (OOB) Brier score: 0.23293394
        (OOB) Normalized Brier score: 0.93173574
                           (OOB) AUC: 0.65048668
                      (OOB) Log-loss: 0.65855641
                        (OOB) PR-AUC: 0.62608067
                        (OOB) G-mean: 0.60369638
   (OOB) Requested performance error: 0.39630362

Confusion matrix:

          predicted
  observed  No Yes class.error
       No  492 309      0.3858
       Yes 342 499      0.4067

      (OOB) Misclassification rate: 0.3964677
plot(rfobj,plots.one.page = FALSE)


                              all   No   Yes
See Friends                0.0104   NA    NA
Income_median              0.0094   NA    NA
EnglishSpeak               0.0092   NA    NA
Religious Attendance       0.0067   NA    NA
Community Shares Values    0.0039   NA    NA
Helpful Community          0.0025   NA    NA
Close Friends              0.0024   NA    NA
Age                        0.0018   NA    NA
Community Trust            0.0018   NA    NA
Religion                   0.0018   NA    NA
Close-knit Community       0.0000   NA    NA
Family Respect            -0.0006   NA    NA
Gender                    -0.0006   NA    NA
Trust                     -0.0012   NA    NA
Get Along                 -0.0023   NA    NA
Feel Close                -0.0030   NA    NA
Close Family              -0.0030   NA    NA
EnglishDiff               -0.0035   NA    NA
Helpful Friends           -0.0036   NA    NA
Family Pride              -0.0042   NA    NA
Spend Time Together       -0.0042   NA    NA
Togetherness              -0.0048   NA    NA
Helpful Family            -0.0049   NA    NA
Successful Family         -0.0054   NA    NA
Expression                -0.0055   NA    NA
Ethnicity                 -0.0066   NA    NA
rfobj$importance
                                  all No Yes
Ethnicity               -6.617697e-03 NA  NA
Age                      1.823620e-03 NA  NA
Gender                  -6.193423e-04 NA  NA
Religion                 1.817454e-03 NA  NA
Employment              -7.934585e-03 NA  NA
Income_median            9.442515e-03 NA  NA
EnglishSpeak             9.223977e-03 NA  NA
EnglishDiff             -3.508588e-03 NA  NA
See Family              -1.524012e-02 NA  NA
Close Family            -3.016992e-03 NA  NA
Helpful Family          -4.851720e-03 NA  NA
See Friends              1.036126e-02 NA  NA
Close Friends            2.422024e-03 NA  NA
Helpful Friends         -3.645479e-03 NA  NA
Family Respect          -6.193423e-04 NA  NA
Similar Values          -7.929731e-03 NA  NA
Successful Family       -5.419827e-03 NA  NA
Trust                   -1.225781e-03 NA  NA
Loyalty                 -7.875119e-03 NA  NA
Family Pride            -4.235469e-03 NA  NA
Expression              -5.496590e-03 NA  NA
Spend Time Together     -4.248899e-03 NA  NA
Feel Close              -2.982737e-03 NA  NA
Togetherness            -4.754138e-03 NA  NA
Religious Attendance     6.730834e-03 NA  NA
Religious Importance    -1.036653e-02 NA  NA
Close-knit Community     2.213112e-05 NA  NA
Helpful Community        2.533134e-03 NA  NA
Community Shares Values  3.900228e-03 NA  NA
Get Along               -2.316820e-03 NA  NA
Community Trust          1.817454e-03 NA  NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw()
  
plot(importance_plot)

Test Set Importance

test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T)
get.imbalanced.performance(test_rf)
 n.majority  n.minority      iratio   threshold        sens        spec 
280.0000000 266.0000000   1.0526316   0.4871795   0.6654135   0.6214286 
       prec         npv    misclass       brier  brier.norm         auc 
  0.6254417   0.6615970   0.3571429   0.2282976   0.9131906   0.6750101 
    logloss          F1       F1mod pr.auc.rand      pr.auc     F1gmean 
  0.6490824   0.6448087   0.6428404   0.4871795   0.6409420   0.6439269 
 F1modgmean       gmean 
  0.6429427   0.6430451 
test_rf$importance
                                  all No Yes
Ethnicity                0.0091830578 NA  NA
Age                      0.0033678960 NA  NA
Gender                  -0.0001164933 NA  NA
Religion                 0.0058558745 NA  NA
Employment              -0.0007179757 NA  NA
Income_median            0.0106624244 NA  NA
EnglishSpeak             0.0283652914 NA  NA
EnglishDiff              0.0090379447 NA  NA
See Family               0.0034879006 NA  NA
Close Family             0.0003110750 NA  NA
Helpful Family           0.0008932684 NA  NA
See Friends              0.0060705775 NA  NA
Close Friends            0.0054451949 NA  NA
Helpful Friends          0.0030227785 NA  NA
Family Respect          -0.0002876884 NA  NA
Similar Values           0.0013104850 NA  NA
Successful Family        0.0022066150 NA  NA
Trust                    0.0015505719 NA  NA
Loyalty                  0.0018317141 NA  NA
Family Pride             0.0016332540 NA  NA
Expression               0.0021782431 NA  NA
Spend Time Together      0.0000550326 NA  NA
Feel Close               0.0005144554 NA  NA
Togetherness            -0.0004880158 NA  NA
Religious Attendance     0.0024238396 NA  NA
Religious Importance     0.0003718744 NA  NA
Close-knit Community     0.0007995488 NA  NA
Helpful Community        0.0016636832 NA  NA
Community Shares Values  0.0011167880 NA  NA
Get Along                0.0002869137 NA  NA
Community Trust          0.0027582174 NA  NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
  theme_bw()

importance_plot

Health Insurance

ps(`Health Insurance`)
# A tibble: 3 × 3
  `Health Insurance`     n    pct
  <fct>              <int>  <dbl>
1 0                    381 14.6  
2 Yes                 2207 84.6  
3 <NA>                  21  0.805

Random Forest (randomForestSRC)

#install.packages("randomForestSRC)

rfdata <- qol |> 
  select(`Health Insurance`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`) %>%
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`) |> 
  as.data.frame() |> 
  rename_with(make.names)

imb <- imbalanced(Health.Insurance ~ .,importance=T,data=rfdata,
                    perf.type = "gmean",splitrule="gini")
print(imb)
                         Sample size: 2189
           Frequency of class labels: 292, 1897
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 258.8407
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1383
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 6.4966
                   (OOB) Brier score: 0.10382762
        (OOB) Normalized Brier score: 0.41531048
                           (OOB) AUC: 0.74522678
                      (OOB) Log-loss: 0.34769219
                        (OOB) PR-AUC: 0.33319584
                        (OOB) G-mean: 0.66543552
   (OOB) Requested performance error: 0.33456448

Confusion matrix:

          predicted
  observed   0  Yes class.error
       0   224   68      0.2329
       Yes 802 1095      0.4228

      (OOB) Misclassification rate: 0.3974418
get.imbalanced.performance(imb)
  n.majority   n.minority       iratio    threshold         sens         spec 
1897.0000000  292.0000000    6.4965753    0.1333942    0.7671233    0.5772272 
        prec          npv     misclass        brier   brier.norm          auc 
   0.2183236    0.9415305    0.3974418    0.1038276    0.4153105    0.7452268 
     logloss           F1        F1mod  pr.auc.rand       pr.auc      F1gmean 
   0.3476922    0.3399090    0.4609118    0.1333942    0.3331958    0.5026722 
  F1modgmean        gmean 
   0.5631737    0.6654355 
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
  
# Create ggplot for variable importance
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_minimal()
  
plot(importance_plot)

Training/Test set Variable Importance

Training Importance

set.seed(222)
# ind_pos <- sample(c(0,1), nrow(pos), replace = T, prob = c(0.7, 0.3))
# ind_neg <- sample(c(0,1), nrow(neg), replace = T, prob = c(0.7, 0.3))
# 
# 
# train <- bind_rows(pos[ind_pos==0,],neg[ind_neg==0,])
# test <- bind_rows(pos[ind_pos==1,],neg[ind_neg==1,])

imbal_index <- createDataPartition(rfdata$Health.Insurance,p=0.75,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Health.Insurance~.,
                          data=imbal_train,
                          seed=3)$data
test<- rfdata[-imbal_index,]

# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Health.Insurance ~ .,importance=T,data=train,
                    perf.type = "gmean",splitrule="gini")
print(rfobj)
                         Sample size: 1642
           Frequency of class labels: 809, 833
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 212.829
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1038
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0297
                   (OOB) Brier score: 0.07131577
        (OOB) Normalized Brier score: 0.28526307
                           (OOB) AUC: 0.98914077
                      (OOB) Log-loss: 0.27271079
                        (OOB) PR-AUC: 0.9888072
                        (OOB) G-mean: 0.94745674
   (OOB) Requested performance error: 0.05254326

Confusion matrix:

          predicted
  observed Yes   0 class.error
       Yes 745  64      0.0791
       0    21 812      0.0252

      (OOB) Misclassification rate: 0.05176614
print(rfobj)
                         Sample size: 1642
           Frequency of class labels: 809, 833
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 212.829
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1038
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0297
                   (OOB) Brier score: 0.07131577
        (OOB) Normalized Brier score: 0.28526307
                           (OOB) AUC: 0.98914077
                      (OOB) Log-loss: 0.27271079
                        (OOB) PR-AUC: 0.9888072
                        (OOB) G-mean: 0.94745674
   (OOB) Requested performance error: 0.05254326

Confusion matrix:

          predicted
  observed Yes   0 class.error
       Yes 745  64      0.0791
       0    21 812      0.0252

      (OOB) Misclassification rate: 0.05176614
plot(rfobj,plots.one.page = FALSE)


                             all   Yes    0
Ethnicity                 0.0251    NA   NA
Religion                  0.0174    NA   NA
EnglishSpeak              0.0164    NA   NA
Income_median             0.0146    NA   NA
Employment                0.0135    NA   NA
Religious.Attendance      0.0134    NA   NA
EnglishDiff               0.0108    NA   NA
Helpful.Community         0.0099    NA   NA
Religious.Importance      0.0090    NA   NA
Close.knit.Community      0.0077    NA   NA
Community.Trust           0.0076    NA   NA
Helpful.Family            0.0072    NA   NA
Get.Along                 0.0070    NA   NA
See.Friends               0.0066    NA   NA
Age                       0.0056    NA   NA
Community.Shares.Values   0.0054    NA   NA
Gender                    0.0054    NA   NA
Similar.Values            0.0049    NA   NA
Helpful.Friends           0.0049    NA   NA
Feel.Close                0.0041    NA   NA
Expression                0.0041    NA   NA
Close.Friends             0.0031    NA   NA
Family.Respect            0.0030    NA   NA
Togetherness              0.0030    NA   NA
Close.Family              0.0025    NA   NA
Successful.Family         0.0024    NA   NA
rfobj$importance
                                  all Yes  0
Ethnicity                0.0250746224  NA NA
Age                      0.0055677537  NA NA
Gender                   0.0053598163  NA NA
Religion                 0.0173662061  NA NA
Employment               0.0135148040  NA NA
Income_median            0.0145521135  NA NA
EnglishSpeak             0.0164197923  NA NA
EnglishDiff              0.0108215914  NA NA
See.Family               0.0004832958  NA NA
Close.Family             0.0024919682  NA NA
Helpful.Family           0.0072131811  NA NA
See.Friends              0.0066286342  NA NA
Close.Friends            0.0030747402  NA NA
Helpful.Friends          0.0048771687  NA NA
Family.Respect           0.0030221032  NA NA
Similar.Values           0.0048771687  NA NA
Successful.Family        0.0024385785  NA NA
Trust                    0.0012192878  NA NA
Loyalty                  0.0011675380  NA NA
Family.Pride             0.0011675380  NA NA
Expression               0.0041406824  NA NA
Spend.Time.Together     -0.0001017975  NA NA
Feel.Close               0.0041406824  NA NA
Togetherness             0.0029710402  NA NA
Religious.Attendance     0.0133599027  NA NA
Religious.Importance     0.0089690063  NA NA
Close.knit.Community     0.0077033446  NA NA
Helpful.Community        0.0099316404  NA NA
Community.Shares.Values  0.0054094335  NA NA
Get.Along                0.0069843678  NA NA
Community.Trust          0.0076149226  NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw()
  
plot(importance_plot)

Test Set Importance

test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T)
get.imbalanced.performance(test_rf)
 n.majority  n.minority      iratio   threshold        sens        spec 
474.0000000  73.0000000   6.4931507   0.1334552   0.9589041   0.1962025 
       prec         npv    misclass       brier  brier.norm         auc 
  0.1552106   0.9687500   0.7020110   0.1512833   0.6051333   0.6777065 
    logloss          F1       F1mod pr.auc.rand      pr.auc     F1gmean 
  0.4682492   0.2671756   0.2937991   0.1334552   0.2236314   0.3504630 
 F1modgmean       gmean 
  0.3637747   0.4337504 
test_rf$importance
                                  all Yes  0
Ethnicity               -1.043120e-02  NA NA
Age                      8.892604e-03  NA NA
Gender                   3.965169e-03  NA NA
Religion                 7.176509e-03  NA NA
Employment               5.141174e-02  NA NA
Income_median            9.433785e-02  NA NA
EnglishSpeak             4.047334e-02  NA NA
EnglishDiff              2.793042e-03  NA NA
See.Family              -4.845609e-03  NA NA
Close.Family            -4.077925e-03  NA NA
Helpful.Family          -7.243761e-03  NA NA
See.Friends             -6.401686e-03  NA NA
Close.Friends           -1.795732e-03  NA NA
Helpful.Friends         -3.495917e-03  NA NA
Family.Respect          -1.679729e-03  NA NA
Similar.Values          -1.288285e-04  NA NA
Successful.Family        8.413456e-05  NA NA
Trust                   -3.280590e-03  NA NA
Loyalty                  1.298347e-03  NA NA
Family.Pride             3.210424e-03  NA NA
Expression               3.092767e-03  NA NA
Spend.Time.Together     -3.346290e-03  NA NA
Feel.Close               2.436198e-03  NA NA
Togetherness             1.673344e-03  NA NA
Religious.Attendance     7.593028e-03  NA NA
Religious.Importance     2.386402e-03  NA NA
Close.knit.Community     1.967129e-02  NA NA
Helpful.Community        3.244624e-02  NA NA
Community.Shares.Values  1.121431e-02  NA NA
Get.Along                1.284339e-02  NA NA
Community.Trust          1.648031e-02  NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
  theme_bw()

importance_plot

Dental Insurance

ps(`Dental Insurance`)
# A tibble: 3 × 3
  `Dental Insurance`     n   pct
  <fct>              <int> <dbl>
1 0                   1050 40.2 
2 Yes                 1529 58.6 
3 <NA>                  30  1.15

Random Forest (randomForestSRC)

#install.packages("randomForestSRC)

rfdata <- qol |> 
  select(`Dental Insurance`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`) %>%
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`) |> 
  as.data.frame() |> 
  rename_with(make.names)

imb <- imbalanced(Dental.Insurance ~ .,importance=T,data=rfdata,
                    perf.type = "gmean",splitrule="gini")
print(imb)
                         Sample size: 2184
           Frequency of class labels: 849, 1335
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 397.6117
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1380
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.5724
                   (OOB) Brier score: 0.1774567
        (OOB) Normalized Brier score: 0.70982678
                           (OOB) AUC: 0.79679575
                      (OOB) Log-loss: 0.53238987
                        (OOB) PR-AUC: 0.7016539
                        (OOB) G-mean: 0.73581395
   (OOB) Requested performance error: 0.26418605

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   648 201      0.2367
       Yes 388 947      0.2906

      (OOB) Misclassification rate: 0.2696886
get.imbalanced.performance(imb)
  n.majority   n.minority       iratio    threshold         sens         spec 
1335.0000000  849.0000000    1.5724382    0.3887363    0.7632509    0.7093633 
        prec          npv     misclass        brier   brier.norm          auc 
   0.6254826    0.8249129    0.2696886    0.1774567    0.7098268    0.7967957 
     logloss           F1        F1mod  pr.auc.rand       pr.auc      F1gmean 
   0.5323899    0.6875332    0.7232077    0.3887363    0.7016539    0.7116736 
  F1modgmean        gmean 
   0.7295108    0.7358139 
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
  
# Create ggplot for variable importance
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_minimal()
  
plot(importance_plot)

Training/Test set Variable Importance

Training Importance

set.seed(222)
imbal_index <- createDataPartition(rfdata$Dental.Insurance,p=0.75,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Dental.Insurance~.,
                          data=imbal_train,
                          seed=3)$data
test<- rfdata[-imbal_index,]

# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Dental.Insurance ~ .,importance=T,data=train,
                    perf.type = "gmean",splitrule="gini")
print(rfobj)
                         Sample size: 1639
           Frequency of class labels: 807, 832
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 237.8013
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1036
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.031
                   (OOB) Brier score: 0.10888745
        (OOB) Normalized Brier score: 0.43554981
                           (OOB) AUC: 0.94193758
                      (OOB) Log-loss: 0.36875306
                        (OOB) PR-AUC: 0.93722367
                        (OOB) G-mean: 0.87409144
   (OOB) Requested performance error: 0.12590856

Confusion matrix:

          predicted
  observed Yes   0 class.error
       Yes 697 110      0.1363
       0    96 736      0.1154

      (OOB) Misclassification rate: 0.1256864
print(rfobj)
                         Sample size: 1639
           Frequency of class labels: 807, 832
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 237.8013
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1036
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.031
                   (OOB) Brier score: 0.10888745
        (OOB) Normalized Brier score: 0.43554981
                           (OOB) AUC: 0.94193758
                      (OOB) Log-loss: 0.36875306
                        (OOB) PR-AUC: 0.93722367
                        (OOB) G-mean: 0.87409144
   (OOB) Requested performance error: 0.12590856

Confusion matrix:

          predicted
  observed Yes   0 class.error
       Yes 697 110      0.1363
       0    96 736      0.1154

      (OOB) Misclassification rate: 0.1256864
plot(rfobj,plots.one.page = FALSE)


                             all   Yes    0
Income_median             0.0391    NA   NA
EnglishSpeak              0.0217    NA   NA
Employment                0.0186    NA   NA
Religion                  0.0177    NA   NA
EnglishDiff               0.0157    NA   NA
Ethnicity                 0.0145    NA   NA
Religious.Importance      0.0108    NA   NA
Religious.Attendance      0.0096    NA   NA
Get.Along                 0.0060    NA   NA
Community.Trust           0.0049    NA   NA
Helpful.Family            0.0048    NA   NA
Close.Friends             0.0042    NA   NA
Age                       0.0035    NA   NA
Community.Shares.Values   0.0030    NA   NA
Spend.Time.Together       0.0029    NA   NA
Gender                    0.0025    NA   NA
Close.knit.Community      0.0024    NA   NA
Family.Respect            0.0024    NA   NA
Togetherness              0.0023    NA   NA
Family.Pride              0.0023    NA   NA
Loyalty                   0.0023    NA   NA
Expression                0.0018    NA   NA
Trust                     0.0018    NA   NA
Close.Family              0.0012    NA   NA
Feel.Close                0.0012    NA   NA
Successful.Family         0.0012    NA   NA
rfobj$importance
                                  all Yes  0
Ethnicity                1.450301e-02  NA NA
Age                      3.493177e-03  NA NA
Gender                   2.475878e-03  NA NA
Religion                 1.768573e-02  NA NA
Employment               1.862361e-02  NA NA
Income_median            3.914102e-02  NA NA
EnglishSpeak             2.173959e-02  NA NA
EnglishDiff              1.569938e-02  NA NA
See.Family               1.157721e-03  NA NA
Close.Family             1.220851e-03  NA NA
Helpful.Family           4.763477e-03  NA NA
See.Friends              4.780771e-04  NA NA
Close.Friends            4.224828e-03  NA NA
Helpful.Friends         -6.304234e-05  NA NA
Family.Respect           2.378485e-03  NA NA
Similar.Values          -9.200622e-05  NA NA
Successful.Family        1.157721e-03  NA NA
Trust                    1.814843e-03  NA NA
Loyalty                  2.322104e-03  NA NA
Family.Pride             2.349440e-03  NA NA
Expression               1.814843e-03  NA NA
Spend.Time.Together      2.945910e-03  NA NA
Feel.Close               1.157721e-03  NA NA
Togetherness             2.349440e-03  NA NA
Religious.Attendance     9.573872e-03  NA NA
Religious.Importance     1.081689e-02  NA NA
Close.knit.Community     2.409239e-03  NA NA
Helpful.Community        4.780771e-04  NA NA
Community.Shares.Values  2.974120e-03  NA NA
Get.Along                5.984164e-03  NA NA
Community.Trust          4.883412e-03  NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw()
  
plot(importance_plot)

Test Set Importance

test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T)
get.imbalanced.performance(test_rf)
 n.majority  n.minority      iratio   threshold        sens        spec 
333.0000000 212.0000000   1.5707547   0.3889908   0.8113208   0.5705706 
       prec         npv    misclass       brier  brier.norm         auc 
  0.5460317   0.8260870   0.3357798   0.1882569   0.7530276   0.7770377 
    logloss          F1       F1mod pr.auc.rand      pr.auc     F1gmean 
  0.5578571   0.6527514   0.6636678   0.3889908   0.6956436   0.6665653 
 F1modgmean       gmean 
  0.6720235   0.6803791 
test_rf$importance
                                  all Yes  0
Ethnicity                7.803448e-03  NA NA
Age                      8.560578e-03  NA NA
Gender                  -9.026692e-04  NA NA
Religion                 3.745040e-03  NA NA
Employment               1.977079e-02  NA NA
Income_median            7.812789e-02  NA NA
EnglishSpeak             2.458840e-02  NA NA
EnglishDiff              4.502704e-03  NA NA
See.Family               4.685076e-05  NA NA
Close.Family             6.759530e-05  NA NA
Helpful.Family          -5.780287e-06  NA NA
See.Friends             -3.527108e-03  NA NA
Close.Friends           -2.801842e-04  NA NA
Helpful.Friends          1.002257e-03  NA NA
Family.Respect          -6.529591e-04  NA NA
Similar.Values           2.538353e-05  NA NA
Successful.Family        1.098977e-03  NA NA
Trust                    9.871958e-05  NA NA
Loyalty                  1.004083e-03  NA NA
Family.Pride            -8.173244e-04  NA NA
Expression               1.551758e-03  NA NA
Spend.Time.Together     -8.461073e-05  NA NA
Feel.Close               6.082376e-04  NA NA
Togetherness            -7.650017e-04  NA NA
Religious.Attendance     1.515353e-03  NA NA
Religious.Importance    -3.489919e-04  NA NA
Close.knit.Community     5.696589e-04  NA NA
Helpful.Community       -5.733979e-04  NA NA
Community.Shares.Values  6.702416e-04  NA NA
Get.Along                2.021478e-03  NA NA
Community.Trust         -9.908891e-06  NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
  theme_bw()

importance_plot

Physical Checkup

ps(`Physical Check-up`)
# A tibble: 3 × 3
  `Physical Check-up`     n   pct
  <fct>               <int> <dbl>
1 0                     833 31.9 
2 Yes                  1740 66.7 
3 <NA>                   36  1.38

Random Forest (randomForestSRC)

#install.packages("randomForestSRC)

rfdata <- qol |> 
  select(`Physical Check-up`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`) %>%
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`) |> 
  as.data.frame() |> 
  rename_with(make.names)

imb <- imbalanced(Physical.Check.up ~ .,importance=T,data=rfdata,
                    perf.type = "gmean",splitrule="gini")
print(imb)
                         Sample size: 2178
           Frequency of class labels: 704, 1474
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 425.687
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1376
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 2.0938
                   (OOB) Brier score: 0.19839145
        (OOB) Normalized Brier score: 0.7935658
                           (OOB) AUC: 0.68816397
                      (OOB) Log-loss: 0.58312274
                        (OOB) PR-AUC: 0.50025633
                        (OOB) G-mean: 0.6427269
   (OOB) Requested performance error: 0.3572731

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   495 209      0.2969
       Yes 608 866      0.4125

      (OOB) Misclassification rate: 0.3751148
plot(imb,plots.one.page = F)


                           all    0   Yes
Age                     0.0529   NA    NA
Income_median           0.0250   NA    NA
Employment              0.0139   NA    NA
Ethnicity               0.0136   NA    NA
EnglishSpeak            0.0079   NA    NA
Gender                  0.0063   NA    NA
Religion                0.0056   NA    NA
EnglishDiff             0.0053   NA    NA
Helpful.Community       0.0030   NA    NA
Expression              0.0029   NA    NA
Close.knit.Community    0.0026   NA    NA
Feel.Close              0.0025   NA    NA
Religious.Importance    0.0024   NA    NA
Togetherness            0.0008   NA    NA
See.Friends             0.0005   NA    NA
Close.Family            0.0000   NA    NA
See.Family             -0.0002   NA    NA
Loyalty                -0.0006   NA    NA
Helpful.Family         -0.0006   NA    NA
Get.Along              -0.0014   NA    NA
Religious.Attendance   -0.0016   NA    NA
Community.Trust        -0.0018   NA    NA
Similar.Values         -0.0019   NA    NA
Helpful.Friends        -0.0021   NA    NA
Family.Pride           -0.0021   NA    NA
Family.Respect         -0.0023   NA    NA
get.imbalanced.performance(imb)
  n.majority   n.minority       iratio    threshold         sens         spec 
1474.0000000  704.0000000    2.0937500    0.3232323    0.7031250    0.5875170 
        prec          npv     misclass        brier   brier.norm          auc 
   0.4487761    0.8055814    0.3751148    0.1983915    0.7935658    0.6881640 
     logloss           F1        F1mod  pr.auc.rand       pr.auc      F1gmean 
   0.5831227    0.5478694    0.6066191    0.3232323    0.5002563    0.5952981 
  F1modgmean        gmean 
   0.6246730    0.6427269 
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
  
# Create ggplot for variable importance
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_minimal()
  
plot(importance_plot)

Training/Test set Variable Importance

Training Importance

set.seed(222)
imbal_index <- createDataPartition(rfdata$Physical.Check.up,p=0.75,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Physical.Check.up~.,
                          data=imbal_train,
                          seed=3)$data
test<- rfdata[-imbal_index,]

# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Physical.Check.up~ .,importance=T,data=train,
                    perf.type = "gmean",splitrule="gini")
print(rfobj)
                         Sample size: 1634
           Frequency of class labels: 803, 831
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 269.5033
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1033
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0349
                   (OOB) Brier score: 0.13265442
        (OOB) Normalized Brier score: 0.53061769
                           (OOB) AUC: 0.93201562
                      (OOB) Log-loss: 0.43239367
                        (OOB) PR-AUC: 0.92794805
                        (OOB) G-mean: 0.84812364
   (OOB) Requested performance error: 0.15187636

Confusion matrix:

          predicted
  observed Yes   0 class.error
       Yes 677 126      0.1569
       0   122 709      0.1468

      (OOB) Misclassification rate: 0.1517748
print(rfobj)
                         Sample size: 1634
           Frequency of class labels: 803, 831
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 269.5033
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1033
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0349
                   (OOB) Brier score: 0.13265442
        (OOB) Normalized Brier score: 0.53061769
                           (OOB) AUC: 0.93201562
                      (OOB) Log-loss: 0.43239367
                        (OOB) PR-AUC: 0.92794805
                        (OOB) G-mean: 0.84812364
   (OOB) Requested performance error: 0.15187636

Confusion matrix:

          predicted
  observed Yes   0 class.error
       Yes 677 126      0.1569
       0   122 709      0.1468

      (OOB) Misclassification rate: 0.1517748
plot(rfobj,plots.one.page = FALSE)


                             all   Yes    0
EnglishDiff               0.0384    NA   NA
Ethnicity                 0.0249    NA   NA
Income_median             0.0191    NA   NA
Close.knit.Community      0.0151    NA   NA
Age                       0.0141    NA   NA
Religious.Attendance      0.0139    NA   NA
Gender                    0.0128    NA   NA
Employment                0.0120    NA   NA
Similar.Values            0.0109    NA   NA
Community.Trust           0.0102    NA   NA
Get.Along                 0.0096    NA   NA
Successful.Family         0.0090    NA   NA
Community.Shares.Values   0.0090    NA   NA
Religion                  0.0090    NA   NA
Religious.Importance      0.0078    NA   NA
Expression                0.0072    NA   NA
EnglishSpeak              0.0072    NA   NA
Trust                     0.0066    NA   NA
Togetherness              0.0060    NA   NA
Helpful.Community         0.0060    NA   NA
Helpful.Friends           0.0055    NA   NA
Feel.Close                0.0047    NA   NA
Helpful.Family            0.0042    NA   NA
Family.Respect            0.0041    NA   NA
See.Family                0.0036    NA   NA
Loyalty                   0.0030    NA   NA
rfobj$importance
                                  all Yes  0
Ethnicity                0.0248968775  NA NA
Age                      0.0140963705  NA NA
Gender                   0.0128162244  NA NA
Religion                 0.0089571442  NA NA
Employment               0.0120478229  NA NA
Income_median            0.0190921015  NA NA
EnglishSpeak             0.0071910506  NA NA
EnglishDiff              0.0384020696  NA NA
See.Family               0.0036459869  NA NA
Close.Family             0.0023719132  NA NA
Helpful.Family           0.0041971735  NA NA
See.Friends              0.0005718011  NA NA
Close.Friends            0.0011714134  NA NA
Helpful.Friends          0.0055246937  NA NA
Family.Respect           0.0041385762  NA NA
Similar.Values           0.0109202600  NA NA
Successful.Family        0.0090357236  NA NA
Trust                    0.0066049572  NA NA
Loyalty                  0.0030206794  NA NA
Family.Pride            -0.0001192607  NA NA
Expression               0.0072266925  NA NA
Spend.Time.Together      0.0010864867  NA NA
Feel.Close               0.0047425012  NA NA
Togetherness             0.0060023659  NA NA
Religious.Attendance     0.0139328764  NA NA
Religious.Importance     0.0077953860  NA NA
Close.knit.Community     0.0150870035  NA NA
Helpful.Community        0.0059667757  NA NA
Community.Shares.Values  0.0089607159  NA NA
Get.Along                0.0095895593  NA NA
Community.Trust          0.0101974130  NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw()
  
plot(importance_plot)

Test Set Importance

test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T)
get.imbalanced.performance(test_rf)
 n.majority  n.minority      iratio   threshold        sens        spec 
368.0000000 176.0000000   2.0909091   0.3235294   0.9147727   0.2472826 
       prec         npv    misclass       brier  brier.norm         auc 
  0.3675799   0.8584906   0.5367647   0.2206503   0.8826012   0.6578249 
    logloss          F1       F1mod pr.auc.rand      pr.auc     F1gmean 
  0.6316164   0.5244300   0.4433383   0.3235294   0.4660592   0.5000213 
 F1modgmean       gmean 
  0.4594755   0.4756126 
test_rf$importance
                                  all Yes  0
Ethnicity                0.0114439693  NA NA
Age                      0.0219713731  NA NA
Gender                   0.0016219260  NA NA
Religion                 0.0027888994  NA NA
Employment               0.0062918989  NA NA
Income_median            0.0229205076  NA NA
EnglishSpeak             0.0033919262  NA NA
EnglishDiff             -0.0003063299  NA NA
See.Family              -0.0002408119  NA NA
Close.Family            -0.0007071544  NA NA
Helpful.Family           0.0003203322  NA NA
See.Friends              0.0011020651  NA NA
Close.Friends           -0.0033020947  NA NA
Helpful.Friends         -0.0002238848  NA NA
Family.Respect           0.0012973660  NA NA
Similar.Values           0.0001397280  NA NA
Successful.Family        0.0029397819  NA NA
Trust                    0.0006694806  NA NA
Loyalty                 -0.0003052977  NA NA
Family.Pride             0.0049864195  NA NA
Expression               0.0030384659  NA NA
Spend.Time.Together      0.0020002278  NA NA
Feel.Close               0.0027576521  NA NA
Togetherness             0.0018777620  NA NA
Religious.Attendance    -0.0019438390  NA NA
Religious.Importance     0.0035575453  NA NA
Close.knit.Community     0.0048280191  NA NA
Helpful.Community        0.0030374539  NA NA
Community.Shares.Values  0.0058025090  NA NA
Get.Along                0.0021470728  NA NA
Community.Trust          0.0021712026  NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
  theme_bw()

importance_plot

Dental Checkup

ps(`Dentist Check-up`)
# A tibble: 3 × 3
  `Dentist Check-up`     n   pct
  <fct>              <int> <dbl>
1 0                   1100 42.2 
2 Yes                 1462 56.0 
3 <NA>                  47  1.80

Random Forest (randomForestSRC)

#install.packages("randomForestSRC)

rfdata <- qol |> 
  select(`Dentist Check-up`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`) %>%
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`) |> 
  as.data.frame() |> 
  rename_with(make.names)

imb <- imbalanced(Dentist.Check.up ~ .,importance=T,data=rfdata,
                    perf.type = "gmean",splitrule="gini")
print(imb)
                         Sample size: 2175
           Frequency of class labels: 896, 1279
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 450.5877
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1375
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.4275
                   (OOB) Brier score: 0.21235502
        (OOB) Normalized Brier score: 0.8494201
                           (OOB) AUC: 0.70813816
                      (OOB) Log-loss: 0.61336314
                        (OOB) PR-AUC: 0.60231866
                        (OOB) G-mean: 0.64945053
   (OOB) Requested performance error: 0.35054947

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   608 288      0.3214
       Yes 484 795      0.3784

      (OOB) Misclassification rate: 0.3549425
plot(imb,plots.one.page = F)


                              all    0   Yes
EnglishSpeak               0.0180   NA    NA
Close.Family               0.0057   NA    NA
Feel.Close                 0.0050   NA    NA
Helpful.Family             0.0049   NA    NA
Family.Pride               0.0040   NA    NA
Employment                 0.0036   NA    NA
Gender                     0.0034   NA    NA
Successful.Family          0.0021   NA    NA
Expression                 0.0016   NA    NA
Close.Friends              0.0005   NA    NA
See.Friends                0.0005   NA    NA
Age                        0.0002   NA    NA
Ethnicity                 -0.0013   NA    NA
Close.knit.Community      -0.0016   NA    NA
Community.Shares.Values   -0.0018   NA    NA
Similar.Values            -0.0019   NA    NA
Community.Trust           -0.0022   NA    NA
Spend.Time.Together       -0.0025   NA    NA
Togetherness              -0.0027   NA    NA
Religious.Importance      -0.0033   NA    NA
EnglishDiff               -0.0034   NA    NA
Helpful.Friends           -0.0038   NA    NA
Loyalty                   -0.0039   NA    NA
Trust                     -0.0043   NA    NA
Income_median             -0.0043   NA    NA
Get.Along                 -0.0044   NA    NA
get.imbalanced.performance(imb)
  n.majority   n.minority       iratio    threshold         sens         spec 
1279.0000000  896.0000000    1.4274554    0.4119540    0.6785714    0.6215794 
        prec          npv     misclass        brier   brier.norm          auc 
   0.5567766    0.7340720    0.3549425    0.2123550    0.8494201    0.7081382 
     logloss           F1        F1mod  pr.auc.rand       pr.auc      F1gmean 
   0.6133631    0.6116700    0.6409429    0.4119540    0.6023187    0.6305603 
  F1modgmean        gmean 
   0.6451967    0.6494505 
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
  
# Create ggplot for variable importance
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_minimal()
  
plot(importance_plot)

Training/Test set Variable Importance

Training Importance

set.seed(222)
imbal_index <- createDataPartition(rfdata$Dentist.Check.up,p=0.75,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Dentist.Check.up~.,
                          data=imbal_train,
                          seed=3)$data
test<- rfdata[-imbal_index,]

# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Dentist.Check.up~ .,importance=T,data=train,
                    perf.type = "gmean",splitrule="gini")
print(rfobj)
                         Sample size: 1632
           Frequency of class labels: 802, 830
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 268.4903
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1031
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0349
                   (OOB) Brier score: 0.13575662
        (OOB) Normalized Brier score: 0.54302647
                           (OOB) AUC: 0.91982243
                      (OOB) Log-loss: 0.44019353
                        (OOB) PR-AUC: 0.91304878
                        (OOB) G-mean: 0.84832876
   (OOB) Requested performance error: 0.15167124

Confusion matrix:

          predicted
  observed Yes   0 class.error
       Yes 670 132      0.1646
       0   115 715      0.1386

      (OOB) Misclassification rate: 0.151348
print(rfobj)
                         Sample size: 1632
           Frequency of class labels: 802, 830
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 268.4903
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1031
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0349
                   (OOB) Brier score: 0.13575662
        (OOB) Normalized Brier score: 0.54302647
                           (OOB) AUC: 0.91982243
                      (OOB) Log-loss: 0.44019353
                        (OOB) PR-AUC: 0.91304878
                        (OOB) G-mean: 0.84832876
   (OOB) Requested performance error: 0.15167124

Confusion matrix:

          predicted
  observed Yes   0 class.error
       Yes 670 132      0.1646
       0   115 715      0.1386

      (OOB) Misclassification rate: 0.151348
plot(rfobj,plots.one.page = FALSE)


                             all   Yes    0
Ethnicity                 0.0327    NA   NA
Income_median             0.0303    NA   NA
Religion                  0.0265    NA   NA
EnglishSpeak              0.0229    NA   NA
Gender                    0.0158    NA   NA
EnglishDiff               0.0156    NA   NA
Community.Shares.Values   0.0104    NA   NA
Close.Friends             0.0091    NA   NA
Get.Along                 0.0090    NA   NA
Close.knit.Community      0.0080    NA   NA
Community.Trust           0.0078    NA   NA
Religious.Attendance      0.0077    NA   NA
Successful.Family         0.0077    NA   NA
Employment                0.0076    NA   NA
Religious.Importance      0.0074    NA   NA
Helpful.Family            0.0066    NA   NA
Age                       0.0061    NA   NA
Close.Family              0.0049    NA   NA
Helpful.Community         0.0042    NA   NA
See.Friends               0.0042    NA   NA
Similar.Values            0.0042    NA   NA
Family.Pride              0.0036    NA   NA
Feel.Close                0.0029    NA   NA
Spend.Time.Together       0.0023    NA   NA
Expression                0.0023    NA   NA
Family.Respect            0.0023    NA   NA
rfobj$importance
                                 all Yes  0
Ethnicity                0.032663624  NA NA
Age                      0.006131614  NA NA
Gender                   0.015828603  NA NA
Religion                 0.026518680  NA NA
Employment               0.007647381  NA NA
Income_median            0.030294863  NA NA
EnglishSpeak             0.022921737  NA NA
EnglishDiff              0.015550752  NA NA
See.Family              -0.005575811  NA NA
Close.Family             0.004905287  NA NA
Helpful.Family           0.006649953  NA NA
See.Friends              0.004197575  NA NA
Close.Friends            0.009102336  NA NA
Helpful.Friends          0.001150065  NA NA
Family.Respect           0.002307021  NA NA
Similar.Values           0.004162872  NA NA
Successful.Family        0.007747457  NA NA
Trust                    0.001114601  NA NA
Loyalty                  0.001745203  NA NA
Family.Pride             0.003639835  NA NA
Expression               0.002307021  NA NA
Spend.Time.Together      0.002340760  NA NA
Feel.Close               0.002903862  NA NA
Togetherness             0.001080911  NA NA
Religious.Attendance     0.007747457  NA NA
Religious.Importance     0.007357942  NA NA
Close.knit.Community     0.007991442  NA NA
Helpful.Community        0.004234059  NA NA
Community.Shares.Values  0.010365281  NA NA
Get.Along                0.009034317  NA NA
Community.Trust          0.007776946  NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw()
  
plot(importance_plot)

Test Set Importance

test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T)
get.imbalanced.performance(test_rf)
 n.majority  n.minority      iratio   threshold        sens        spec 
319.0000000 224.0000000   1.4241071   0.4125230   0.7767857   0.4514107 
       prec         npv    misclass       brier  brier.norm         auc 
  0.4985673   0.7422680   0.4143646   0.2330666   0.9322663   0.6526009 
    logloss          F1       F1mod pr.auc.rand      pr.auc     F1gmean 
  0.6598971   0.6073298   0.5834643   0.4125230   0.5200296   0.5997432 
 F1modgmean       gmean 
  0.5878104   0.5921565 
test_rf$importance
                                  all Yes  0
Ethnicity                0.0190548830  NA NA
Age                     -0.0008363182  NA NA
Gender                   0.0021456891  NA NA
Religion                 0.0149186629  NA NA
Employment               0.0061462960  NA NA
Income_median            0.0291057705  NA NA
EnglishSpeak             0.0138818279  NA NA
EnglishDiff              0.0093890066  NA NA
See.Family               0.0009114775  NA NA
Close.Family            -0.0012305512  NA NA
Helpful.Family          -0.0012548631  NA NA
See.Friends             -0.0018495515  NA NA
Close.Friends           -0.0025569592  NA NA
Helpful.Friends         -0.0002722153  NA NA
Family.Respect           0.0044396784  NA NA
Similar.Values          -0.0003858598  NA NA
Successful.Family        0.0025583855  NA NA
Trust                    0.0011096618  NA NA
Loyalty                  0.0014748035  NA NA
Family.Pride             0.0009339725  NA NA
Expression               0.0006243612  NA NA
Spend.Time.Together      0.0001531622  NA NA
Feel.Close               0.0021725962  NA NA
Togetherness             0.0006275521  NA NA
Religious.Attendance     0.0028457122  NA NA
Religious.Importance    -0.0003142404  NA NA
Close.knit.Community    -0.0020620379  NA NA
Helpful.Community       -0.0004374439  NA NA
Community.Shares.Values  0.0004516296  NA NA
Get.Along               -0.0002527406  NA NA
Community.Trust          0.0018926823  NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
  theme_bw()

importance_plot

Urgent Care

ps(`Urgentcare`)
# A tibble: 3 × 3
  Urgentcare     n   pct
  <fct>      <int> <dbl>
1 0           2112 81.0 
2 Yes          440 16.9 
3 <NA>          57  2.18

Random Forest (randomForestSRC)

#install.packages("randomForestSRC)

rfdata <- qol |> 
  select(`Urgentcare`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`) %>%
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`) |> 
  as.data.frame() |> 
  rename_with(make.names)

imb <- imbalanced(`Urgentcare` ~ .,importance=T,data=rfdata,
                    perf.type = "gmean",splitrule="gini")
print(imb)
                         Sample size: 2167
           Frequency of class labels: 1808, 359
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 325.9177
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1370
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 5.0362
                   (OOB) Brier score: 0.13716597
        (OOB) Normalized Brier score: 0.5486639
                           (OOB) AUC: 0.59096526
                      (OOB) Log-loss: 0.44659192
                        (OOB) PR-AUC: 0.23502521
                        (OOB) G-mean: 0.55299713
   (OOB) Requested performance error: 0.44700287

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   863 945      0.5227
       Yes 129 230      0.3593

      (OOB) Misclassification rate: 0.4956161
plot(imb,plots.one.page = F)


                           all    0   Yes
Close.Family            0.0224   NA    NA
Ethnicity               0.0184   NA    NA
Income_median           0.0175   NA    NA
Helpful.Family          0.0138   NA    NA
Religion                0.0115   NA    NA
Helpful.Friends         0.0108   NA    NA
Helpful.Community       0.0095   NA    NA
Close.Friends           0.0094   NA    NA
Age                     0.0091   NA    NA
Get.Along               0.0082   NA    NA
Loyalty                 0.0082   NA    NA
Community.Trust         0.0080   NA    NA
Family.Pride            0.0075   NA    NA
Religious.Importance    0.0072   NA    NA
Trust                   0.0070   NA    NA
See.Friends             0.0054   NA    NA
Successful.Family       0.0053   NA    NA
Similar.Values          0.0047   NA    NA
See.Family              0.0035   NA    NA
Family.Respect          0.0032   NA    NA
Spend.Time.Together     0.0030   NA    NA
Feel.Close              0.0029   NA    NA
EnglishDiff             0.0028   NA    NA
EnglishSpeak            0.0010   NA    NA
Employment             -0.0006   NA    NA
Religious.Attendance   -0.0010   NA    NA
get.imbalanced.performance(imb)
  n.majority   n.minority       iratio    threshold         sens         spec 
1808.0000000  359.0000000    5.0362117    0.1656668    0.6406685    0.4773230 
        prec          npv     misclass        brier   brier.norm          auc 
   0.1957447    0.8699597    0.4956161    0.1371660    0.5486639    0.5909653 
     logloss           F1        F1mod  pr.auc.rand       pr.auc      F1gmean 
   0.4465919    0.2998696    0.4034674    0.1656668    0.2350252    0.4264334 
  F1modgmean        gmean 
   0.4782322    0.5529971 
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
  
# Create ggplot for variable importance
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_minimal()
  
plot(importance_plot)

Training/Test set Variable Importance

Training Importance

set.seed(222)
imbal_index <- createDataPartition(rfdata$Urgentcare,p=0.75,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Urgentcare~.,
                          data=imbal_train,
                          seed=3)$data
test<- rfdata[-imbal_index,]

# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Urgentcare~ .,importance=T,data=train,
                    perf.type = "gmean",splitrule="gini")
print(rfobj)
                         Sample size: 1626
           Frequency of class labels: 800, 826
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 257.0357
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1028
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0325
                   (OOB) Brier score: 0.10820287
        (OOB) Normalized Brier score: 0.43281149
                           (OOB) AUC: 0.97188105
                      (OOB) Log-loss: 0.37643219
                        (OOB) PR-AUC: 0.96438371
                        (OOB) G-mean: 0.91746819
   (OOB) Requested performance error: 0.08253181

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   729  71      0.0887
       Yes  63 763      0.0763

      (OOB) Misclassification rate: 0.08241082
print(rfobj)
                         Sample size: 1626
           Frequency of class labels: 800, 826
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 257.0357
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1028
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0325
                   (OOB) Brier score: 0.10820287
        (OOB) Normalized Brier score: 0.43281149
                           (OOB) AUC: 0.97188105
                      (OOB) Log-loss: 0.37643219
                        (OOB) PR-AUC: 0.96438371
                        (OOB) G-mean: 0.91746819
   (OOB) Requested performance error: 0.08253181

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   729  71      0.0887
       Yes  63 763      0.0763

      (OOB) Misclassification rate: 0.08241082
plot(rfobj,plots.one.page = FALSE)


                             all    0   Yes
Ethnicity                 0.0170   NA    NA
Religion                  0.0170   NA    NA
Religious.Attendance      0.0158   NA    NA
EnglishDiff               0.0127   NA    NA
Get.Along                 0.0103   NA    NA
Helpful.Family            0.0092   NA    NA
Community.Trust           0.0085   NA    NA
Community.Shares.Values   0.0084   NA    NA
EnglishSpeak              0.0079   NA    NA
Income_median             0.0071   NA    NA
Religious.Importance      0.0066   NA    NA
Helpful.Community         0.0066   NA    NA
Employment                0.0060   NA    NA
Close.Friends             0.0037   NA    NA
Close.knit.Community      0.0035   NA    NA
See.Friends               0.0030   NA    NA
Feel.Close                0.0030   NA    NA
Loyalty                   0.0029   NA    NA
Successful.Family         0.0029   NA    NA
Family.Respect            0.0023   NA    NA
Togetherness              0.0017   NA    NA
Close.Family              0.0012   NA    NA
Similar.Values            0.0011   NA    NA
Trust                     0.0011   NA    NA
Spend.Time.Together       0.0010   NA    NA
Helpful.Friends           0.0006   NA    NA
rfobj$importance
                                  all  0 Yes
Ethnicity                1.699750e-02 NA  NA
Age                     -7.669618e-05 NA  NA
Gender                  -7.032206e-04 NA  NA
Religion                 1.698406e-02 NA  NA
Employment               5.953208e-03 NA  NA
Income_median            7.143534e-03 NA  NA
EnglishSpeak             7.887759e-03 NA  NA
EnglishDiff              1.267868e-02 NA  NA
See.Family              -9.896162e-05 NA  NA
Close.Family             1.203237e-03 NA  NA
Helpful.Family           9.163088e-03 NA  NA
See.Friends              3.035886e-03 NA  NA
Close.Friends            3.664146e-03 NA  NA
Helpful.Friends          5.750132e-04 NA  NA
Family.Respect           2.341081e-03 NA  NA
Similar.Values           1.110752e-03 NA  NA
Successful.Family        2.928322e-03 NA  NA
Trust                    1.074421e-03 NA  NA
Loyalty                  2.946524e-03 NA  NA
Family.Pride             4.858915e-04 NA  NA
Expression              -2.048803e-03 NA  NA
Spend.Time.Together      1.044696e-03 NA  NA
Feel.Close               2.966382e-03 NA  NA
Togetherness             1.681505e-03 NA  NA
Religious.Attendance     1.575473e-02 NA  NA
Religious.Importance     6.574343e-03 NA  NA
Close.knit.Community     3.505178e-03 NA  NA
Helpful.Community        6.562713e-03 NA  NA
Community.Shares.Values  8.402837e-03 NA  NA
Get.Along                1.026504e-02 NA  NA
Community.Trust          8.536022e-03 NA  NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw()
  
plot(importance_plot)

Test Set Importance

test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T)
get.imbalanced.performance(test_rf)
  n.majority   n.minority       iratio    threshold         sens         spec 
452.00000000  89.00000000   5.07865169   0.16451017   0.97752809   0.01327434 
        prec          npv     misclass        brier   brier.norm          auc 
  0.16322702   0.75000000   0.82809612   0.19174317   0.76697270   0.62026449 
     logloss           F1        F1mod  pr.auc.rand       pr.auc      F1gmean 
  0.57101833   0.27974277   0.04772353   0.16451017   0.24471152   0.19682759 
  F1modgmean        gmean 
  0.08081797   0.11391241 
test_rf$importance
                                  all  0 Yes
Ethnicity                0.0092679961 NA  NA
Age                     -0.0010239624 NA  NA
Gender                  -0.0003857482 NA  NA
Religion                 0.0039876778 NA  NA
Employment               0.0025635146 NA  NA
Income_median            0.0116989391 NA  NA
EnglishSpeak             0.0252844657 NA  NA
EnglishDiff              0.0032710913 NA  NA
See.Family               0.0009597281 NA  NA
Close.Family             0.0053102624 NA  NA
Helpful.Family          -0.0020859738 NA  NA
See.Friends             -0.0006427318 NA  NA
Close.Friends            0.0028938441 NA  NA
Helpful.Friends         -0.0042392218 NA  NA
Family.Respect           0.0014365929 NA  NA
Similar.Values           0.0030089055 NA  NA
Successful.Family        0.0049482951 NA  NA
Trust                    0.0085479527 NA  NA
Loyalty                  0.0054304756 NA  NA
Family.Pride             0.0034385977 NA  NA
Expression               0.0091406593 NA  NA
Spend.Time.Together      0.0116566550 NA  NA
Feel.Close               0.0023862014 NA  NA
Togetherness             0.0139434459 NA  NA
Religious.Attendance     0.0062598150 NA  NA
Religious.Importance     0.0035382193 NA  NA
Close.knit.Community     0.0044157273 NA  NA
Helpful.Community        0.0143630787 NA  NA
Community.Shares.Values  0.0001754873 NA  NA
Get.Along               -0.0063640012 NA  NA
Community.Trust         -0.0015271133 NA  NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
  theme_bw()

importance_plot

Folk Medicine

ps(`Folkmedicine`)
# A tibble: 3 × 3
  Folkmedicine     n   pct
  <fct>        <int> <dbl>
1 0             2189 83.9 
2 Yes            348 13.3 
3 <NA>            72  2.76

Random Forest (randomForestSRC)

#install.packages("randomForestSRC)

rfdata <- qol |> 
  select(`Folkmedicine`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`) %>%
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`) |> 
  as.data.frame() |> 
  rename_with(make.names)

imb <- imbalanced(Folkmedicine ~ .,importance=T,data=rfdata,
                    perf.type = "gmean",splitrule="gini")
print(imb)
                         Sample size: 2152
           Frequency of class labels: 1866, 286
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 271.2393
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1360
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 6.5245
                   (OOB) Brier score: 0.11194129
        (OOB) Normalized Brier score: 0.44776514
                           (OOB) AUC: 0.66867163
                      (OOB) Log-loss: 0.37567305
                        (OOB) PR-AUC: 0.21700153
                        (OOB) G-mean: 0.62960131
   (OOB) Requested performance error: 0.37039869

Confusion matrix:

          predicted
  observed    0 Yes class.error
       0   1037 829      0.4443
       Yes   82 204      0.2867

      (OOB) Misclassification rate: 0.4233271
plot(imb,plots.one.page = F)


                              all    0   Yes
Age                        0.0537   NA    NA
Ethnicity                  0.0494   NA    NA
EnglishSpeak               0.0124   NA    NA
Trust                      0.0118   NA    NA
Religion                   0.0113   NA    NA
Community.Trust            0.0110   NA    NA
EnglishDiff                0.0108   NA    NA
Feel.Close                 0.0102   NA    NA
Spend.Time.Together        0.0096   NA    NA
Similar.Values             0.0092   NA    NA
Loyalty                    0.0086   NA    NA
Community.Shares.Values    0.0083   NA    NA
Family.Respect             0.0068   NA    NA
Religious.Importance       0.0065   NA    NA
Successful.Family          0.0062   NA    NA
Helpful.Community          0.0061   NA    NA
Helpful.Friends            0.0059   NA    NA
Family.Pride               0.0048   NA    NA
Income_median              0.0040   NA    NA
Employment                 0.0040   NA    NA
Expression                 0.0028   NA    NA
Gender                     0.0018   NA    NA
Togetherness               0.0009   NA    NA
See.Family                 0.0001   NA    NA
Close.knit.Community       0.0000   NA    NA
See.Friends               -0.0004   NA    NA
get.imbalanced.performance(imb)
  n.majority   n.minority       iratio    threshold         sens         spec 
1866.0000000  286.0000000    6.5244755    0.1328996    0.7132867    0.5557342 
        prec          npv     misclass        brier   brier.norm          auc 
   0.1974831    0.9267203    0.4233271    0.1119413    0.4477651    0.6686716 
     logloss           F1        F1mod  pr.auc.rand       pr.auc      F1gmean 
   0.3756730    0.3093252    0.4280739    0.1328996    0.2170015    0.4694633 
  F1modgmean        gmean 
   0.5288376    0.6296013 
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
  
# Create ggplot for variable importance
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_minimal()
  
plot(importance_plot)

Training/Test set Variable Importance

Training Importance

set.seed(222)
imbal_index <- createDataPartition(rfdata$Folkmedicine,p=0.75,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Folkmedicine~.,
                          data=imbal_train,
                          seed=3)$data
test<- rfdata[-imbal_index,]

# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(`Folkmedicine` ~ .,importance=T,data=train,
                    perf.type = "gmean",splitrule="gini")
print(rfobj)
                         Sample size: 1615
           Frequency of class labels: 793, 822
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 231.3443
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1021
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0366
                   (OOB) Brier score: 0.0867378
        (OOB) Normalized Brier score: 0.34695121
                           (OOB) AUC: 0.98107682
                      (OOB) Log-loss: 0.31961798
                        (OOB) PR-AUC: 0.97903926
                        (OOB) G-mean: 0.92703296
   (OOB) Requested performance error: 0.07296704

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   710  83      0.1047
       Yes  33 789      0.0401

      (OOB) Misclassification rate: 0.07182663
print(rfobj)
                         Sample size: 1615
           Frequency of class labels: 793, 822
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 231.3443
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1021
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0366
                   (OOB) Brier score: 0.0867378
        (OOB) Normalized Brier score: 0.34695121
                           (OOB) AUC: 0.98107682
                      (OOB) Log-loss: 0.31961798
                        (OOB) PR-AUC: 0.97903926
                        (OOB) G-mean: 0.92703296
   (OOB) Requested performance error: 0.07296704

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   710  83      0.1047
       Yes  33 789      0.0401

      (OOB) Misclassification rate: 0.07182663
plot(rfobj,plots.one.page = FALSE)


                           all    0   Yes
EnglishSpeak            0.0117   NA    NA
Religious.Importance    0.0106   NA    NA
Community.Trust         0.0089   NA    NA
Religious.Attendance    0.0088   NA    NA
Religion                0.0075   NA    NA
Age                     0.0071   NA    NA
Ethnicity               0.0068   NA    NA
Get.Along               0.0057   NA    NA
Close.Family            0.0055   NA    NA
EnglishDiff             0.0044   NA    NA
Family.Pride            0.0037   NA    NA
See.Friends             0.0035   NA    NA
Income_median           0.0029   NA    NA
Employment              0.0020   NA    NA
Gender                  0.0018   NA    NA
Feel.Close              0.0017   NA    NA
Spend.Time.Together     0.0011   NA    NA
Successful.Family       0.0010   NA    NA
Close.knit.Community    0.0009   NA    NA
Helpful.Community       0.0008   NA    NA
Trust                   0.0007   NA    NA
Loyalty                 0.0006   NA    NA
Family.Respect          0.0005   NA    NA
Helpful.Family          0.0005   NA    NA
See.Family              0.0004   NA    NA
Helpful.Friends        -0.0003   NA    NA
rfobj$importance
                                  all  0 Yes
Ethnicity                0.0067640720 NA  NA
Age                      0.0070766909 NA  NA
Gender                   0.0017640986 NA  NA
Religion                 0.0075320542 NA  NA
Employment               0.0020095165 NA  NA
Income_median            0.0028814427 NA  NA
EnglishSpeak             0.0116840910 NA  NA
EnglishDiff              0.0043874728 NA  NA
See.Family               0.0004013891 NA  NA
Close.Family             0.0054846916 NA  NA
Helpful.Family           0.0004618194 NA  NA
See.Friends              0.0034717663 NA  NA
Close.Friends           -0.0014286934 NA  NA
Helpful.Friends         -0.0003060976 NA  NA
Family.Respect           0.0004618194 NA  NA
Similar.Values          -0.0015459999 NA  NA
Successful.Family        0.0009917884 NA  NA
Trust                    0.0007201414 NA  NA
Loyalty                  0.0005876596 NA  NA
Family.Pride             0.0037209612 NA  NA
Expression              -0.0013675560 NA  NA
Spend.Time.Together      0.0011127301 NA  NA
Feel.Close               0.0017019253 NA  NA
Togetherness            -0.0014286934 NA  NA
Religious.Attendance     0.0087508013 NA  NA
Religious.Importance     0.0105853089 NA  NA
Close.knit.Community     0.0008774882 NA  NA
Helpful.Community        0.0008228279 NA  NA
Community.Shares.Values -0.0004698596 NA  NA
Get.Along                0.0056769849 NA  NA
Community.Trust          0.0089086889 NA  NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw()
  
plot(importance_plot)

Test Set Importance

test_rf <- predict.rfsrc(rfobj,newdata=test,importance = T)
get.imbalanced.performance(test_rf)
  n.majority   n.minority       iratio    threshold         sens         spec 
466.00000000  71.00000000   6.56338028   0.13221601   1.00000000   0.03218884 
        prec          npv     misclass        brier   brier.norm          auc 
  0.13601533   1.00000000   0.83985102   0.17768100   0.71072400   0.63974944 
     logloss           F1        F1mod  pr.auc.rand       pr.auc      F1gmean 
  0.53556189   0.23946037   0.09896390   0.13221601   0.20345993   0.20943643 
  F1modgmean        gmean 
  0.13918819   0.17941249 
test_rf$importance
                                  all  0 Yes
Ethnicity                4.873851e-02 NA  NA
Age                      3.857721e-02 NA  NA
Gender                  -3.894706e-05 NA  NA
Religion                 8.982006e-03 NA  NA
Employment               7.597281e-03 NA  NA
Income_median            6.111832e-03 NA  NA
EnglishSpeak             1.610365e-02 NA  NA
EnglishDiff              1.057510e-02 NA  NA
See.Family              -1.988875e-03 NA  NA
Close.Family            -4.847550e-03 NA  NA
Helpful.Family          -1.618896e-03 NA  NA
See.Friends             -7.924134e-03 NA  NA
Close.Friends           -3.134457e-03 NA  NA
Helpful.Friends         -1.694438e-03 NA  NA
Family.Respect           5.171665e-03 NA  NA
Similar.Values           1.137199e-02 NA  NA
Successful.Family        6.723931e-04 NA  NA
Trust                   -2.195275e-03 NA  NA
Loyalty                  2.697698e-03 NA  NA
Family.Pride             3.843059e-04 NA  NA
Expression              -2.271595e-05 NA  NA
Spend.Time.Together      3.687656e-03 NA  NA
Feel.Close               1.138885e-03 NA  NA
Togetherness             3.526200e-03 NA  NA
Religious.Attendance    -6.751275e-03 NA  NA
Religious.Importance     3.075361e-03 NA  NA
Close.knit.Community    -5.567672e-03 NA  NA
Helpful.Community        4.667771e-03 NA  NA
Community.Shares.Values  7.458798e-03 NA  NA
Get.Along               -6.150190e-04 NA  NA
Community.Trust          2.991126e-03 NA  NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
  theme_bw()

importance_plot